In [1]:
import networkx as nx
import pandas as pd
import pickle as pkl
import numpy as np
from numpy.random import RandomState  # random_state for networkX only for python3.6
rng = RandomState(787351)
import os
DATA_DIR = '../../data/data_schoolofinf'
In [2]:
# Graphing/Image
SAVE_GRAPHS = True
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use(['seaborn-poster'])
In [3]:
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from network_artist import *
from infnet_helper import *

Informatics Collaboration Network

In this notebook, we create and visualise three different models of the informatics network:

To create the informatics collaboration network, we need to do additional processing to the lookup tables (dataframe) for poinf and pub.

Setup lookup_pub

Filter the list of publications, limiting the dataset to a period from 1997-2017. Further constraints will be carried out later for each model.

In [4]:
# simply use the function:
lookup_pub = get_lookup_pub(1997,2017)
In [5]:
lookup_pub.head(3)
Out[5]:
year title authors shortnames nb_authors nb_shortnames collab_id edges collab_poinf nb_poinf
pub_id
18b1a861-afef-4fff-bc80-d02e05be18c4 2013 query processing in data integration [paolo guagliardo, piotr wieczorek] [guagliardo, p., wieczorek, p.] 2 2 [653c9723-b374-4ad3-9cef-0f7e7f45d812, 4886] [(653c9723-b374-4ad3-9cef-0f7e7f45d812, 4886)] [653c9723-b374-4ad3-9cef-0f7e7f45d812] 1
d5814bab-5fc2-4c31-92b7-543c7ce75cb4 2012 evaluation of speaker verification security an... [p.l. de leon, m. pucher, j. yamagishi, i. her... [leon, p. l., pucher, m., yamagishi, j., herna... 5 5 [3084, 4137, 4d11a99a-0d04-42f4-8089-d433a57c2... [(3084, 4137), (3084, 4d11a99a-0d04-42f4-8089-... [4d11a99a-0d04-42f4-8089-d433a57c2463] 1
880944d3-26db-4003-9186-130bf3202941 2014 openairinterface a flexible platform for g res... [navid nikaein, mahesh k. marina, saravana man... [nikaein, n., marina, m. k., manickam, s., daw... 6 6 [796, 06760916-edca-488e-93da-baff6204a453, 54... [(796, 06760916-edca-488e-93da-baff6204a453), ... [06760916-edca-488e-93da-baff6204a453] 1
In [6]:
lookup_pub.info()
<class 'pandas.core.frame.DataFrame'>
Index: 8028 entries, 18b1a861-afef-4fff-bc80-d02e05be18c4 to b2920a27-5293-4f4a-8874-4a0ea804d91a
Data columns (total 10 columns):
year             8028 non-null int64
title            8024 non-null object
authors          8028 non-null object
shortnames       8028 non-null object
nb_authors       8028 non-null int64
nb_shortnames    8028 non-null int64
collab_id        8028 non-null object
edges            8028 non-null object
collab_poinf     8028 non-null object
nb_poinf         8028 non-null int64
dtypes: int64(4), object(6)
memory usage: 689.9+ KB

We can add the year that an individual makes his/her first publication. This would allow us to create a temporal evolution of the network graph.

Setup lookup_poinf

In [208]:
_lookup_poinf = get_lookup_poinf()
In [7]:
# using the function instead:
lookup_poinf = get_lookup_poinf()
# remove individuals who does not have any publication (i.e. first_pub_year == 0)
lookup_poinf.dropna(axis=0, subset=['first_pub_year'], inplace=True)
lookup_poinf = lookup_poinf[(lookup_poinf.total_pub > 0)
                            & (lookup_poinf.total_pub_1997 > 0)]
In [8]:
lookup_poinf.info()
<class 'pandas.core.frame.DataFrame'>
Index: 224 entries, d089bb44-7d0b-4b3f-b556-27def993d1d1 to ccd9cd8c-9aee-4ede-a388-f82641ee8d29
Data columns (total 14 columns):
last_name          224 non-null object
first_name         224 non-null object
personal_url       224 non-null object
position           224 non-null object
parent             224 non-null object
institute          224 non-null object
full_name          224 non-null object
institute_class    224 non-null int64
alias              224 non-null object
first_pub_year     224 non-null float64
total_pub          224 non-null int64
total_pub_1997     224 non-null int64
total_pub_2012     224 non-null int64
pub_ids            224 non-null object
dtypes: float64(1), int64(4), object(9)
memory usage: 26.2+ KB

Exploring both lookup tables

In [9]:
no_poinf = len(lookup_pub[lookup_pub.nb_poinf == 0])
ratio_ = no_poinf / len(lookup_pub)
print(no_poinf, ratio_)
240 0.029895366218236172
In [10]:
one_poinf_only = len(lookup_pub[lookup_pub.nb_poinf == 1])
ratio_ = one_poinf_only / len(lookup_pub)
print(one_poinf_only, ratio_)
6661 0.8297209765819631
In [11]:
with_poinf_infnet20yr = len(lookup_pub[lookup_pub.nb_poinf.ge(2)])
print((with_poinf_infnet20yr, len(lookup_pub)))
print((with_poinf_infnet20yr / len(lookup_pub)))
(1127, 8028)
0.1403836571998007
In [12]:
lookup_pub_infnet6yr = lookup_pub[lookup_pub.year.ge(2012)]
with_poinf_6yr = len(
    lookup_pub_infnet6yr[lookup_pub_infnet6yr.nb_poinf.ge(2)])
print((with_poinf_6yr, len(lookup_pub_infnet6yr)))
print((with_poinf_6yr / len(lookup_pub_infnet6yr)))
(657, 3620)
0.1814917127071823
In [13]:
x = lookup_pub[lookup_pub.nb_poinf.ge(4)]
x[x.nb_poinf == x.nb_authors]
Out[13]:
year title authors shortnames nb_authors nb_shortnames collab_id edges collab_poinf nb_poinf
pub_id
b3c5d1b3-3645-44c9-b879-19d3f74386b9 2012 digitised historical text does it have to be m... [beatrice alex, claire grover, ewan klein, ric... [alex, b., grover, c., klein, e., tobin, r.] 4 4 [ecd799fb-4f63-44ae-a078-b009099f2c8c, 7187cc1... [(ecd799fb-4f63-44ae-a078-b009099f2c8c, 7187cc... [ecd799fb-4f63-44ae-a078-b009099f2c8c, 7187cc1... 4
62b37c1e-5232-42e5-926d-937a1fa90f14 2017 simbench a portable benchmarking methodology f... [harry wagstaff, bruno bodin, tom spink, bjoer... [wagstaff, h., bodin, b., spink, t., franke, b.] 4 4 [489d4278-a0a6-4e8b-857a-4ee0e800766f, 412b9b6... [(489d4278-a0a6-4e8b-857a-4ee0e800766f, 412b9b... [489d4278-a0a6-4e8b-857a-4ee0e800766f, 412b9b6... 4
dc63b9e8-2fd6-4fbe-a57a-393423b0aa07 2002 xmlbased nlp tools for analysing and annotatin... [claire grover, ewan klein, mirella lapata, al... [grover, c., klein, e., lapata, m., lascarides... 4 4 [7187cc1d-738f-412c-908f-328256e1008e, 1458e0e... [(7187cc1d-738f-412c-908f-328256e1008e, 1458e0... [7187cc1d-738f-412c-908f-328256e1008e, 1458e0e... 4
536d5547-d618-4729-9d28-40a82afef6ca 2017 imperative functional programs that explain th... [wilmer ricciotti, jan stolarek, roly perera, ... [ricciotti, w., stolarek, j., perera, r., chen... 4 4 [f66a2291-496c-4371-a060-ef2d68c379f3, 31e35c9... [(f66a2291-496c-4371-a060-ef2d68c379f3, 31e35c... [f66a2291-496c-4371-a060-ef2d68c379f3, 31e35c9... 4
5017a420-5ec5-4a6a-af51-5c577031c46a 2015 efficient dualisa support in a retargetable as... [tom spink, harry wagstaff, bjoern franke, nig... [spink, t., wagstaff, h., franke, b., topham, n.] 4 4 [0d940898-e2a7-4262-bf06-5b146fb79ba2, 489d427... [(0d940898-e2a7-4262-bf06-5b146fb79ba2, 489d42... [0d940898-e2a7-4262-bf06-5b146fb79ba2, 489d427... 4
57f5170a-5106-4ae6-9546-ed06ef074d3d 2014 efficient code generation in a regionbased dyn... [tom spink, harry wagstaff, björn franke, nige... [spink, t., wagstaff, h., franke, b., topham, n.] 4 4 [0d940898-e2a7-4262-bf06-5b146fb79ba2, 489d427... [(0d940898-e2a7-4262-bf06-5b146fb79ba2, 489d42... [0d940898-e2a7-4262-bf06-5b146fb79ba2, 489d427... 4
00d069a4-1e03-4fa1-bd7d-d208d3929df7 2016 utilising disaggregated energy data in feedbac... [nigel goddard, martin pullinger, lynda webb, ... [goddard, n., pullinger, m., webb, l., farrow,... 8 8 [053590d0-39d7-4a42-b42d-61ee8d743d3e, 0346dc9... [(053590d0-39d7-4a42-b42d-61ee8d743d3e, 0346dc... [053590d0-39d7-4a42-b42d-61ee8d743d3e, 0346dc9... 8

Import institutes information

Class labels for each institute

In [14]:
INSTITUTES = get_institute()
In [15]:
# separate the individuals by community:
gb = lookup_poinf.groupby('institute_class')
# Number of individuals in each classes:
for k, group in gb:
    className = [name for (name, _k) in list(INSTITUTES.items()) if _k == k][0]
    print(('class {}-{}: {}'.format(k, className, len(group))))
class 0-UNKNOWN: 10
class 1-centre for intelligent systems and their applications: 21
class 2-institute of language cognition and computation: 52
class 3-laboratory for foundations of computer science: 59
class 4-institute for adaptive and neural computation: 28
class 5-institute for computing systems architecture: 24
class 6-neuroinformatics dtc: 13
class 7-institute of perception action and behaviour: 15
class 8-deanery of clinical sciences: 1
class 9-school of philosophy psychology and language sciences: 1

The class distribution is uneven! Classes 1, 2, 3, 4, 5, 6, and 7 are the institutes in the school of informatics. Additional classes are found, but they are relatively small, except for the UNKNOWN class which contains individuals who's institutes are not present.

In [16]:
print(('Number of individuals in informatics:', len(lookup_poinf)))
('Number of individuals in informatics:', 224)

Simple collaboration network

This section, we only consider a simple undirected graph of the informatics collaboration network

In [17]:
all_edges = []
for listEdges in lookup_pub.edges:
    all_edges.extend(listEdges)
In [18]:
# Since the graph of interest is simple ud, we remove all repeated edges
unique_edges = set()
for (i1, i2) in all_edges:
    assert i1 != i2, "SELF LOOPS DETECTED!"

    # We are only interested in an simple undirected graph!
    # if the same edge exists, we can ignore
    if (i2, i1) in unique_edges:
        continue
    else:
        # adding another (i1,i2) into the set will only count as one
        unique_edges.add((i1, i2))
        
print('number of unique edge pairs: ', len(unique_edges))

# Filter all edges pairs if there is only one or none individuals from school of informatics
poinf_edges_only = [(au1, au2) for (au1, au2) in unique_edges
                    if au1 in lookup_poinf.index and au2 in lookup_poinf.index]

print('number of unique edge pairs (informatics only): ', len(poinf_edges_only))
number of unique edge pairs:  54269
number of unique edge pairs (informatics only):  471

infnet-20yr

g_poinf_only: the informatics collaboration graph from 1997-2017

In [19]:
g_infnet20yr = nx.from_edgelist(poinf_edges_only)
In [20]:
print(nx.info(g_infnet20yr))
Name: 
Type: Graph
Number of nodes: 195
Number of edges: 471
Average degree:   4.8308

Generating the adjacency matrix using graph:

In [21]:
g_infnet20yr_nodeorder = sorted(list(g_infnet20yr.nodes))
In [22]:
adjmat_infnet20yr = nx.adj_matrix(g_infnet20yr,nodelist=g_infnet20yr_nodeorder)
In [23]:
adj_mat, fig, order = create_adj_mat(
    g_infnet20yr, g_infnet20yr_nodeorder, draw=True, use_order=True)
if SAVE_GRAPHS:
    plt.savefig(
        'IMG/infnet20yr_adj_mat.png',
        format='png',
        transparent=True,
        bbox_inches='tight')
In [24]:
adj_mat.dump(os.path.join(DATA_DIR, 'mat', 'infnet20yrs-adj-mat.pkl'))
In [25]:
# Save the list of individuals in the graph:
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_1997-2017.txt'), 'w') as f:
    f.write("\n".join(list(g_infnet20yr_nodeorder)))

NOTE: The number of individuals with a publication is expected to by 228; however, in our graph, only 194 nodes are present. Why? Because we filtered the graph according to the edges where both are individuals are present.

Drawing Network

Here, we illustrate each network using different layouts

Spring Layout overall

In [122]:
# this is the pos for the entire network! we can use this to maintain the visualiation across different experiements
pos_full = nx.kamada_kawai_layout(g_infnet20yr)
In [123]:
f = plt.figure(figsize=(8, 8))
ax = f.add_subplot(111)
ax.axis('off')
nx.draw(
    g_infnet20yr,
    pos=pos_full,
    ax=ax,
    node_size=50,
    alpha=.8,
    edge_color='#999966',
    node_color=color_by_inst(g_infnet20yr, lookup_poinf))
In [124]:
f.savefig('IMG/infnet20yr_spring.png',format='png',bbox_inches='tight')

Circular Layout for each class

In [40]:
draw_default_layout(
    g_infnet20yr, lookup_poinf, file_prefix='infnet20yr', with_weight=False)
2018-04-05 17:11:55,950 : INFO : SAVE_GRAPHS: False

Shell Layout

In [42]:
# visualise:
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.axis('off')
nx.draw_networkx(
    g_infnet20yr,
    pos=nx.shell_layout(g_infnet20yr, get_default_nlist(g_infnet20yr,lookup_poinf,as_dict=False)),
    with_labels=False,
    ax=ax,
    edge_color='#999966',
    node_size=40,
    node_color=color_by_inst(g_infnet20yr, lookup_poinf))

if SAVE_GRAPHS:
#     ax.set_title('Informatics Collaboration Network from 1997-2017')
#     ax = add_inst_labels(ax) #You can include the label by uncommenting `ax = add_inst_label(ax)`
    #plt.savefig("IMG/infnet20yr_shell.pdf", format='pdf', bbox_inches="tight")
    plt.savefig("IMG/infnet20yr_shell.png", format='png', bbox_inches="tight")

Circular Layout

ordered according to institute

In [43]:
draw_circular_layout(g_infnet20yr, lookup_poinf, file_prefix='infnet20yr')
2018-04-05 17:12:29,873 : INFO : SAVE_GRAPHS: False

3D network graph

In [45]:
import igraph as ig
from plotly.graph_objs import *
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as offline
offline.init_notebook_mode()
In [47]:
igraph_G = ig.Graph.TupleList(g_infnet20yr.edges)
layt = igraph_G.layout('kk', dim=3)
vs = igraph_G.vs
In [49]:
TRACES = []

_data = {}
# VERTICES
for _id in g_infnet20yr.nodes:
    _idx = vs.find(_id).index
    _k = lookup_poinf.loc[_id].institute_class  # institute_class of individual
    if _k in list(_data.keys()):
        _data[_k]['Xn'].append(layt[_idx][0])  # x-coordinates
        _data[_k]['Yn'].append(layt[_idx][1])  # y-coordinates
        _data[_k]['Zn'].append(layt[_idx][2])  # z-coordinates
        _data[_k]['ids'].append(_id)
    else:
        _data[_k] = {
            'Xn': [layt[_idx][0]],
            'Yn': [layt[_idx][1]],
            'Zn': [layt[_idx][2]],
            "ids": [_id]
        }

# Scatter nodes
for k in sorted(_data.keys()):
    v = _data[k]
    labels = lookup_poinf.loc[v['ids']].full_name.tolist()
    _trace = Scatter3d(
        x=v['Xn'],
        y=v['Yn'],
        z=v['Zn'],
        mode='markers',
        marker=Marker(
            symbol='dot',
            size=6,
            color=inst_by_color[k],
            line=Line(color='rgb(50,50,50)', width=0.5)),
        text=labels,
        hoverinfo='text',
        showlegend=True,
        name=[n for (n, _k) in list(INSTITUTES.items()) if _k == k][0])
    TRACES.append(_trace)
In [50]:
# Scatter edges
# EDGES
Xe, Ye, Ze = [], [], []
vs = igraph_G.vs
for (a, b) in g_infnet20yr.edges:
    _a_idx = vs.find(a).index
    _b_idx = vs.find(b).index
    # x-coordinates of edge ends
    Xe += [layt[_a_idx][0], layt[_b_idx][0], None]
    Ye += [layt[_a_idx][1], layt[_b_idx][1], None]
    Ze += [layt[_a_idx][2], layt[_b_idx][2], None]

trace1 = Scatter3d(
    x=Xe,
    y=Ye,
    z=Ze,
    mode='lines',
    line=Line(color='rgb(125,125,125)', width=2),
    hoverinfo="none",
    showlegend=False)
TRACES.append(trace1)
In [51]:
axis = dict(
    showbackground=False,
    showline=False,
    zeroline=False,
    showgrid=False,
    showticklabels=False,
    title="")
In [52]:
layout = Layout(
    title="Informatics Collaboration Network from 1997-2017",
    width=1000,
    height=1000,
    showlegend=True,
    legend=dict(x=0, y=0, xanchor='auto', yanchor='auto'),
    scene=Scene(
        xaxis=XAxis(axis),
        yaxis=YAxis(axis),
        zaxis=ZAxis(axis),
    ),
    margin=Margin(t=50),
    #     hovermode="x",
    #     xaxis={"range":[1997,2017], 'title':'Year'},
    #     sliders={
    #         'args':[
    #             'trainsition', {
    #                 'duration': 400,
    #                 'easing':'cubic-in-out'
    #             }
    #         ],
    #         'initialValue':'1997',
    #         'plotlycommand':'animate',
    #         'values':years,
    #         'visible':True
    #     }
    #     annotations=Annotations([
    #         Annotation(
    #             showarrow=False,
    #             text='Colored by institutes',
    #             xref='paper',
    #             yref='paper',
    #             x=0,
    #             y=0.1,
    #             xanchor='left',
    #             yanchor='bottom',
    #             font=Font(size=14))
    #     ]),
)
In [53]:
data=Data(TRACES)
fig=Figure(data=data, layout=layout)
offline.iplot(fig)

Collaboration within departments

In this section we visualise each of the seven institutes in School of Informatics

  • class 1-centre for intelligent systems and their applications
  • class 2-institute of language cognition and computation
  • class 3-laboratory for foundations of computer science
  • class 4-institute for adaptive and neural computation
  • class 5-institute for computing systems architecture
  • class 6-neuroinformatics dtc
  • class 7-institute of perception action and behaviour
In [39]:
gb_dept = lookup_poinf.groupby('institute_class')
In [40]:
# visualise:
fig = plt.figure(figsize=(9, 80))
ax = fig.add_subplot(8, 1, 1)
ax.axis('off')
ax.set_title(
    'School of Informatics Collaboration Network\n{} individuals\nAverage degree = {:.3f}'.
    format(
        len(g_poinf_only),
        (sum(dict(g_poinf_only.degree).values()) / len(g_poinf_only))))
nx.draw_networkx(
    g_poinf_only,
    pos=pos,
    with_labels=False,
    ax=ax,
    node_size=40,
    node_color=color_by_inst(g_poinf_only))

ks = [1, 2, 3, 4, 5, 6, 7]
for k in ks:
    inst = [name for (name, _k) in list(institutes.items()) if _k == k][0]
    ax = fig.add_subplot(8, 1, k + 1)
    ax.axis('off')
    individuals = gb_dept.get_group(k).index
    g = g_poinf_only.subgraph(individuals)
    nx.draw_networkx(
        g,
        pos=nx.spring_layout(g),
        with_labels=False,
        ax=ax,
        node_size=40,
        nodelist=g.nodes,
        node_color=color_by_inst(g))
    title = "{}\n{} individuals\nAverage degree = {:.3f}".format(
        inst, len(g),
        sum(dict(g.degree).values()) / len(g))
    ax.set_title(title)
    
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_strat_department.pdf", format='pdf', bbox_inches="tight")

Development of infnet every 5 years

We can now observe the evolution of the informatics network on a time scale of 5 years

In [54]:
years = np.linspace(1997, 2017, num=5, dtype=int)
years
Out[54]:
array([1997, 2002, 2007, 2012, 2017])
In [197]:
fig = plt.figure(figsize=(50, 10))
i = 1

for yr in years[1:]:
    _lookup_poinf = lookup_poinf[lookup_poinf.total_pub_1997>0]
    df = _lookup_poinf.drop(
        _lookup_poinf[_lookup_poinf.first_pub_year > yr].index)
    
    individuals = list(df.index)
    g = g_infnet20yr.subgraph(individuals)
    ax = fig.add_subplot(1, 4, i)
    #     ax.set_title('End of {}'.format(yr))
    ax.axis('off')
    i += 1

    # calculate the shell layout
    nlist = [[] for i in range(12)]
    for node in g:
        c = int(lookup_poinf.institute_class.loc[[str(node)]])
        nlist[c].append(str(node))
    # sort the list according to the size of the list, so that larger circles are outside.
    nlist_dict = {len(a): [] for a in nlist}
    for lst in nlist:
        nlist_dict[len(lst)].extend(lst)

    length = list(nlist_dict.keys())
    # length.sort(reverse=True)
    _nlist = []
    for l in length:
        _nlist.append(nlist_dict[l])
    _nlist = [a for a in _nlist if len(a) > 0]
    
    nx.draw_networkx(
        g,
        #         pos=nx.shell_layout(g, _nlist),
        pos=pos_full,
        with_labels=False,
        ax=ax,
        #         edge_color='#999966',
        node_size=60,
        node_color=color_by_inst(g, lookup_poinf))
#     nx.draw_networkx(
#         g,
#         pos=pos,
#         with_labels=False,
#         ax=ax,
#         node_size=40,
#         nodelist=g.nodes,
#         node_color=color_by_inst(g))

if SAVE_GRAPHS:
    #         fig.suptitle('Evolution of Informatics Collaboration Network (1997-2017)')
    #         plt.savefig("IMG/infnet20yr_evolution.pdf", format='pdf', bbox_inches="tight")
    fig.savefig(
        "IMG/infnet20yr_evolution.png",
        format='png',
        bbox_inches="tight",
        transparent=True)
In [293]:
fig = plt.figure(figsize=(50, 10))
i = 1
for yr in years[1:]:
    df = lookup_poinf.drop(
        lookup_poinf[lookup_poinf.first_pub_year >= yr].index)
    individuals = list(df.index)
    g = g_infnet20yr.subgraph(individuals)
    ax = fig.add_subplot(1, 4, i)
    #     ax.set_title('End of {}'.format(yr))
    ax.axis('off')
    i += 1

    draw_circular_layout(g, lookup_poinf, file_prefix='evol_{}'.format(i), SAVE_GRAPHS=False)
2018-04-10 14:03:21,618 : INFO : SAVE_GRAPHS: False
2018-04-10 14:03:21,723 : INFO : SAVE_GRAPHS: False
2018-04-10 14:03:21,824 : INFO : SAVE_GRAPHS: False
2018-04-10 14:03:21,952 : INFO : SAVE_GRAPHS: False

Statistics

Degree distribution

In [215]:
nx.degree_histogram(g_infnet20yr)
Out[215]:
[0, 32, 35, 24, 20, 13, 23, 5, 13, 9, 5, 3, 5, 4, 1, 0, 0, 1, 1, 0, 0, 1]
In [58]:
ax, degree_seq = degree_dist(g_infnet20yr)
In [59]:
print(degree_seq[0]+degree_seq[1])
32
In [60]:
# we can observe the power-law fit to our degree distribution:
# $F(d) = (\frac{d}{d_{min}})^{-(\alpha-1)}$
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_degreeDist.pdf", format='pdf', bbox_inches="tight")
Values less than or equal to 0 in data. Throwing out 0 or negative values
Calculating best minimal value for power law fit
/afs/inf.ed.ac.uk/user/s14/s1450710/miniconda3/envs/infnet3/lib/python3.6/site-packages/powerlaw.py:697: RuntimeWarning:

invalid value encountered in true_divide

/afs/inf.ed.ac.uk/user/s14/s1450710/miniconda3/envs/infnet3/lib/python3.6/site-packages/powerlaw.py:697: RuntimeWarning:

divide by zero encountered in true_divide

ideally, a log-log plot on the ccdf with power-law should yield a straight line

Clustering coefficients

In [62]:
cc = clustering_coeff(g_infnet20yr)
print('average clustering coefficient: ', cc[1])
average clustering coefficient:  0.36711434367542967
In [212]:
nx.average_clustering(g_infnet20yr,count_zeros=False)
Out[212]:
0.4971340070604776
In [63]:
# The number of triangles for each nodes
transitivity_graph = nx.transitivity(g_infnet20yr)
print('Transitivity:', transitivity_graph)
Transitivity: 0.3070259865255053

Connected components

In [64]:
gccs, percentage = generateGCC(g_infnet20yr)
component 1: 96.92%
component 2: 1.03%
component 3: 1.03%
component 4: 1.03%
In [68]:
# display the connected components:
fig = plt.figure(figsize=(10, 10))
num_col = len(gccs) / 2
for i, g in enumerate(gccs, 1):
    percent = percentage[i - 1]
    ax = fig.add_subplot(num_col, 2, i)
    ax.axis('off')
    ax.set_title('Component {} ({:.2%})'.format(i, percent))
    nx.draw_networkx(
        g,
        node_color=color_by_inst(g,lookup_poinf),
        ax=ax,
        with_labels=False,
        node_size=30,
        edge_color='#999966',
        pos=pos_full)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_CC.pdf", format='pdf', bbox_inches="tight")

Giant Connected Component

Now, we analyse the giant connected component from the network

In [69]:
main_gcc = gccs[0]  # First element is our GCC as we have sorted in reverse
pos_gcc = nx.spring_layout(main_gcc)
In [70]:
print('number of nodes in largest connected component:', len(main_gcc))
number of nodes in largest connected component: 189
In [71]:
print(nx.info(main_gcc))
Name: 
Type: Graph
Number of nodes: 189
Number of edges: 468
Average degree:   4.9524
In [74]:
# Draw the Network
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax = add_inst_labels(ax)
ax.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    with_labels=False,
    ax=ax,
    node_size=40,
    edge_color='#999966',
    node_color=color_by_inst(main_gcc, lookup_poinf))
ax.set_title('Largest connected component ({:.2%})'.format(percentage[0]))
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet20yr_LargestCC.pdf", format='pdf', bbox_inches="tight")

Degree Distribution

In [75]:
ax, degree_seq = degree_dist(main_gcc)
In [76]:
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig(
        "IMG/infnet20yr_LargestCC-degreeDist.pdf", format='pdf', bbox_inches="tight")
Values less than or equal to 0 in data. Throwing out 0 or negative values
Calculating best minimal value for power law fit
/afs/inf.ed.ac.uk/user/s14/s1450710/miniconda3/envs/infnet3/lib/python3.6/site-packages/powerlaw.py:697: RuntimeWarning:

invalid value encountered in true_divide

Clustering Coefficient

In [77]:
cc = clustering_coeff(main_gcc)
In [78]:
print('average clustering coefficient: ', cc[1])
average clustering coefficient:  0.37876876728417347

Average Path & Diameter

In [79]:
nx.average_shortest_path_length(main_gcc)  # AVERAGE PATH
Out[79]:
4.065799842395587
In [80]:
# Diameter:
print('Diameter of graph:', nx.diameter(main_gcc))
Diameter of graph: 10

Community detection

In [71]:
import community
In [ ]:
# Community detection using modularity
parts = community.best_partition(main_gcc)
values = [parts.get(node) for node in main_gcc.nodes()]
# assign each node to the community they belong to
In [ ]:
# Plot the networks side by side:
fig = plt.figure(figsize=(20, 10))
fig.suptitle(
    'Comparison between actual institutes and communities detected\n(Colors of nodes for graph on right is independent of those on left)'
)
ax1 = fig.add_subplot(121)
ax1.set_title('Actual communities in informatics collaboration network')
ax1.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    with_labels=False,
    ax=ax1,
    node_size=40,
    node_color=color_by_inst(main_gcc))
ax2 = fig.add_subplot(122)
ax2.set_title('Communities detected using modularity')
ax2.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    cmap=plt.get_cmap("rainbow"),
    ax=ax2,
    node_color=values,
    node_size=40,
    with_labels=False)
plt.savefig(
    "IMG/infnet_LargestCC_commCompare.pdf", format='pdf', bbox_inches="tight")
In [ ]:
num_comm = len(set(parts.values()))
print('Number of communities detected =', num_comm)
In [ ]:
# plot each community:
fig = plt.figure(figsize=(10, 30))
comms = set(parts.values())
for i, comm_id in enumerate(comms, 1):
    ax = fig.add_subplot(6, 2, i)
    # find those nodes belonging to this community:
    nodes_from_comm = [
        node_id for (node_id, c) in parts.items() if c == comm_id
    ]
    # Generate the subgrph belonging to these nodes:
    subG = nx.subgraph(main_gcc, nodes_from_comm)
    ax.axis('off')
    ax.set_title('Community {}'.format(comm_id))

    nx.draw_networkx(
        subG,
        pos=pos_gcc,
        ax=ax,
        node_color=color_by_inst(nodes_from_comm),
        node_size=20,
        with_labels=False)
ax = add_inst_labels(ax)
plt.savefig(
    "IMG/infnet_LargestCC_communities.pdf", format='pdf', bbox_inches="tight")

Influential individuals

Using betweness centrality as a mean to measure influence of node in the network

In [ ]:
bt = between_parallel(main_gcc)
In [ ]:
top = 11
In [ ]:
# we need to find the index of these max_nodes:
_nodes = list(main_gcc.nodes())
In [ ]:
max_nodes = sorted(bt.items(), key=lambda v: -v[1])[:top]
max_nodes
In [ ]:
# variables for plotting the network: values tell nx how big each node should be
bt_values = [10] * len(main_gcc.nodes())
bt_colors = ['xkcd:black'] * len(main_gcc.nodes())
for max_key, max_val in max_nodes:
    bt_values[_nodes.index(max_key)] = (
        max_val * 150)**2.2  # SCALE IT ACCORDINGLY
    bt_colors[_nodes.index(max_key)] = inst_by_color[int(
        pd_poinf.institute_class.loc[[str(max_key)]])]
In [ ]:
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.axis("off")
plt.suptitle('The top 11 influential individuals in the GCC')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    ax=ax,
    node_color=bt_colors,
    node_size=bt_values,
    with_labels=False)
plt.savefig("IMG/infnet_influencer.pdf", format='pdf', bbox_inches="tight")
In [ ]:
top_ids = [a[0] for a in max_nodes]
In [ ]:
pd_poinf.loc[top_ids]

infnet-6yr Model :: 2012-2017

In [81]:
# What are the years?
print(sorted(pd.unique(lookup_pub.year), reverse=True))
[2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000, 1999, 1998, 1997]
In [82]:
# There are some dates instead of just purely years.
# they all fall out of the 6 year period that we are interestted in (2017-2012)
# Grouping the publications by year:
gb = lookup_pub.groupby('year')
sixYears = [2017, 2016, 2015, 2014, 2013, 2012]
pd_years = {}
print("Year:\tPub count")
for yr, group in gb:
    if yr in sixYears:
        pd_years[yr] = group
        print(("{}:\t{}".format(yr, len(group))))
Year:	Pub count
2012:	574
2013:	624
2014:	631
2015:	586
2016:	651
2017:	554
In [83]:
combined_yrs = pd.concat(pd_years.values())
print("Total publications: ", len(combined_yrs))
Total publications:  3620
In [84]:
all_edges_6yr = []
for listEdges in combined_yrs.edges:
    all_edges_6yr.extend(listEdges)
print('total number of edges: ', len(all_edges_6yr))
total number of edges:  38580
In [85]:
# Since the graph of interest is simple ud, we remove all repeated edges
unique_edges_6yr = set()
for (i1, i2) in all_edges_6yr:
    assert i1 != i2, "SELF LOOPS DETECTED!"

    # We are only interested in an simple undirected graph!
    # if the same edge exists, we can ignore
    if (i2, i1) in unique_edges_6yr:
        continue
    else:
        # adding another (i1,i2) into the set will only count as one
        unique_edges_6yr.add((i1, i2))
In [86]:
print('number of unique edge pairs: ', len(unique_edges_6yr))
number of unique edge pairs:  28026
In [87]:
# Filter all edges pairs if there is only one or none individuals from school of informatics
poinf6yr_edges_only = [(au1, au2) for (au1, au2) in unique_edges_6yr
                       if au1 in lookup_poinf.index and au2 in lookup_poinf.index]
print('size of collaboration network (number of edges): ',
      len(poinf6yr_edges_only))
size of collaboration network (number of edges):  361

Note the large decrease in edges with and without external individuals (from 19,890 to 360)

Generating infnet-6yr

In [89]:
g_infnet6yr = nx.from_edgelist(poinf6yr_edges_only)
pos_6yr = nx.spring_layout(g_infnet6yr)
In [217]:
print(nx.info(g_infnet6yr))
Name: 
Type: Graph
Number of nodes: 184
Number of edges: 361
Average degree:   3.9239

Generating adjacency matrix

In [198]:
NODES_ORDER = []
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_1997-2017.txt'), 'r') as f:
    for node in f:
        NODES_ORDER.append(node.strip())
In [199]:
len(NODES_ORDER)
Out[199]:
195
In [200]:
adj_mat_6yr, fig, order_6yr = create_adj_mat(
    g_infnet6yr, NODES_ORDER, draw=True, use_order=False)
if SAVE_GRAPHS:
    plt.savefig(
        'IMG/infnet6yr_adj_mat_order6yr.png',
        #         'IMG/infnet6yr_adj_mat_order20yr.png',
        format='png',
        transparent=True,
        bbox_inches='tight')
In [97]:
print((adj_mat_6yr.shape))
print((len(order_6yr)))
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_2012-2017.txt'), 'w') as f:
    f.write("\n".join(list(order_6yr)))
(184, 184)
184
In [202]:
# adj_mat.dump(os.path.join(DATA_DIR,'mat','infnet6yrs-adj-mat.order20yr.pkl'))
adj_mat_6yr.dump(os.path.join(DATA_DIR, 'mat', 'infnet6yrs-adj-mat.order6yr.pkl'))

Graphing Infnet (6yr model)

Spring Layout

In [101]:
f = plt.figure(figsize=(8,8))
ax=f.add_subplot(111)
nx.draw(
    g_infnet6yr,
    pos=pos_6yr,
    node_size=30,
    ax=ax,
    node_color=color_by_inst(g_infnet6yr, lookup_poinf))
f.tight_layout()
In [102]:
f.savefig('IMG/infnet6yr_spring.png',format='png', bbox_inches='tight')
In [128]:
f = plt.figure(figsize=(8,8))
ax=f.add_subplot(111)
nx.draw(
    g_infnet6yr,
    pos=pos_full,
    node_size=50,
    alpha=.8,
    edge_color='#999966',
    node_color=color_by_inst(g_infnet6yr, lookup_poinf))
f.tight_layout()
In [129]:
f.savefig('IMG/infnet6yr_spring_winfnet20yrpos.png',format='png',bbox_inches='tight')

Circular layout for each class

In [130]:
draw_default_layout(g_infnet6yr, lookup_poinf, 'infnet6yr')
2018-04-05 17:32:07,362 : INFO : SAVE_GRAPHS: False

Circular Layout

In [203]:
draw_circular_layout(g_infnet6yr, lookup_poinf, file_prefix='infnet6yr', SAVE_GRAPHS=False)
2018-04-05 18:49:38,820 : INFO : SAVE_GRAPHS: True

Shell Layout

In [133]:
nlist = [[] for i in range(12)]
for node in g_infnet6yr:
    c = int(lookup_poinf.institute_class.loc[[str(node)]])
    nlist[c].append(str(node))
# sort the list according to the size of the list, so that larger circles are outside.
nlist_dict = {len(a): [] for a in nlist}
for lst in nlist:
    nlist_dict[len(lst)].extend(lst)

length = list(nlist_dict.keys())
# length.sort(reverse=True)
_nlist = []
for l in length:
    _nlist.append(nlist_dict[l])
_nlist = [a for a in _nlist if len(a) > 0]
In [135]:
# visualise:
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
ax.axis('off')
nx.draw_networkx(
    g_infnet6yr,
    pos=nx.shell_layout(g_infnet6yr, _nlist),
    with_labels=False,
    ax=ax,
    edge_color='#999966',
    node_size=40,
    node_color=color_by_inst(g_infnet6yr,lookup_poinf))

# if SAVE_GRAPHS:
    #     ax.set_title('Informatics Collaboration Network from 1997-2017')
    #     ax = add_inst_labels(ax) #You can include the label by uncommenting `ax = add_inst_label(ax)`
    #     plt.savefig("IMG/infnet6yr_shell.pdf", format='pdf', bbox_inches="tight")
#     plt.savefig(
#         "IMG/infnet6yr_shell.png",
#         format='png',
#         bbox_inches="tight",
#         transparent=True,
#     )
In [137]:
print(nx.info(g_infnet6yr))
Name: 
Type: Graph
Number of nodes: 184
Number of edges: 361
Average degree:   3.9239

Statistics

Degree distribution

In [216]:
nx.degree_histogram(g_infnet6yr)
Out[216]:
[0, 38, 37, 25, 22, 16, 23, 3, 6, 4, 2, 3, 3, 0, 0, 0, 0, 1, 1]
In [213]:
ax, degree_seq = degree_dist(g_infnet6yr)
In [214]:
print(degree_seq[0], degree_seq[1])
0 38
In [140]:
# $F(d) = (\frac{d}{d_{min}})^{-(\alpha-1)}$
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet6yr_degreeDist.pdf", format='pdf', bbox_inches="tight")
Values less than or equal to 0 in data. Throwing out 0 or negative values
Calculating best minimal value for power law fit
/afs/inf.ed.ac.uk/user/s14/s1450710/miniconda3/envs/infnet3/lib/python3.6/site-packages/powerlaw.py:697: RuntimeWarning:

invalid value encountered in true_divide

/afs/inf.ed.ac.uk/user/s14/s1450710/miniconda3/envs/infnet3/lib/python3.6/site-packages/powerlaw.py:697: RuntimeWarning:

divide by zero encountered in true_divide

Clustering coefficient

In [141]:
cc = clustering_coeff(g_infnet6yr)
print('average clustering coefficient: ', cc[1])
average clustering coefficient:  0.42276841072397336
In [210]:
nx.average_clustering(g_infnet6yr,count_zeros=False)
Out[210]:
0.6030185083194657
In [142]:
# The number of triangles for each nodes
transitivity_graph = nx.transitivity(g_infnet6yr)
print('Transitivity:', transitivity_graph)
Transitivity: 0.3971061093247588

Connected Components

In [143]:
gccs, percentage = generateGCC(g_infnet6yr)
component 1: 94.57%
component 2: 1.09%
component 3: 1.09%
component 4: 1.09%
component 5: 1.09%
component 6: 1.09%
In [145]:
# display the connected components:
fig = plt.figure(figsize=(10, 10))
num_col = len(gccs) / 2
for i, g in enumerate(gccs, 1):
    percent = percentage[i - 1]
    ax = fig.add_subplot(num_col, 2, i)
    ax.axis('off')
    ax.set_title('Component {} ({:.2%})'.format(i, percent))
    nx.draw_networkx(
        g,
        node_color=color_by_inst(g, lookup_poinf),
        ax=ax,
        with_labels=False,
        node_size=40,
        pos=pos_6yr)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet6yr_CC.pdf", format='pdf', bbox_inches="tight")

Giant Connected Component

In [147]:
main_gcc = gccs[0]
pos_gcc = nx.spring_layout(main_gcc)
In [153]:
# Draw the graph:
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
# ax = add_inst_labels(ax)
ax.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_full,
#     pos=pos_gcc,
    with_labels=False,
    ax=ax,
    node_size=40,
    node_color=color_by_inst(main_gcc, lookup_poinf)
)
if SAVE_GRAPHS:
    ax.set_title('Largest connected component ({:.2%})'.format(percentage[0]));
    plt.savefig("IMG/infnet6yr_LargestCC.pdf", format='pdf', bbox_inches="tight")
    
In [154]:
print('number of nodes in largest connected component:', len(main_gcc))
print(nx.info(main_gcc))
number of nodes in largest connected component: 174
Name: 
Type: Graph
Number of nodes: 174
Number of edges: 356
Average degree:   4.0920

Degree distribution

In [155]:
ax, degree_seq = degree_dist(main_gcc)
In [156]:
# $F(d) = (\frac{d}{d_{min}})^{-(\alpha-1)}$
fig = power_law_fit(degree_seq)
if SAVE_GRAPHS:
    plt.savefig("IMG/infnet6yr_LargestCC_degreeDist.pdf", format='pdf', 
            bbox_inches="tight")
Values less than or equal to 0 in data. Throwing out 0 or negative values
Calculating best minimal value for power law fit
/afs/inf.ed.ac.uk/user/s14/s1450710/miniconda3/envs/infnet3/lib/python3.6/site-packages/powerlaw.py:697: RuntimeWarning:

invalid value encountered in true_divide

Clustering Coefficient

In [157]:
cc = clustering_coeff(main_gcc)
print('average clustering coefficient: ', cc[1])
average clustering coefficient:  0.4470654458230524

Average Path & Diameter

In [158]:
nx.average_shortest_path_length(main_gcc) # AVERAGE PATH
Out[158]:
4.990698292472261
In [159]:
# Diameter:
print('Diameter of graph:', nx.diameter(main_gcc))
Diameter of graph: 11

Community detection

In [ ]:
# Community detection using modularity
parts = community.best_partition(main_gcc)
values = [parts.get(node) for node in main_gcc.nodes()]  
# assign each node to the community they belong to
In [ ]:
# Plot the graphs side by side:
fig = plt.figure(figsize=(20, 10))
fig.suptitle('Comparison between actual institutes and communities detected\n(Colors of nodes for graph on right is independent of those on left)')
ax1 = fig.add_subplot(121)
ax1.set_title('Actual communities in informatics collaboration network')
ax1.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    with_labels=False,
    ax=ax1,
    node_size=40,
    node_color=color_by_inst(main_gcc)
)


ax2 = fig.add_subplot(122)
ax2.set_title('Communities detected using modularity')
ax2.axis('off')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    cmap=plt.get_cmap("rainbow"),
    ax=ax2,
    node_color=values,
    node_size=40,
    with_labels=False)
plt.savefig("IMG/infnet6yr_LargestCC_commCompare.pdf", format='pdf', 
            bbox_inches="tight")
In [ ]:
num_comm = len(set(parts.values()))
print('Number of communities detected =', num_comm)
In [ ]:
# plot each community:
fig = plt.figure(figsize=(10,30))
comms = set(parts.values())
for i, comm_id in enumerate(comms, 1):
    ax = fig.add_subplot(6,2,i)
    # find those nodes belonging to this community:
    nodes_from_comm = [node_id for (node_id, c) in parts.items() if c == comm_id]
    # Generate the subgrph belonging to these nodes:
    subG = nx.subgraph(main_gcc, nodes_from_comm)
    ax.axis('off')
    ax.set_title('Community {}'.format(comm_id))
    
    nx.draw_networkx(
        subG,
        pos=pos_gcc,
        ax=ax,
        node_color=color_by_inst(nodes_from_comm),
        node_size=40,
        with_labels=False)
ax = add_inst_labels(ax)
plt.savefig("IMG/infnet6yr_LargestCC_communities.pdf", format='pdf', 
            bbox_inches="tight")

Influential individuals

In [ ]:
bt = between_parallel(main_gcc)
In [ ]:
top = 11
In [ ]:
# we need to find the index of these max_nodes:
_nodes = list(main_gcc.nodes())
In [ ]:
max_nodes = sorted(bt.items(), key=lambda v: -v[1])[:top]
max_nodes
In [ ]:
bt_values = [10] * len(main_gcc.nodes())
bt_colors = ['xkcd:black'] * len(main_gcc.nodes())
for max_key, max_val in max_nodes:
    bt_values[_nodes.index(max_key)] = (max_val * 150)**2.2
    bt_colors[_nodes.index(max_key)] = inst_by_color[int(
        pd_poinf.institute_class.loc[[str(max_key)]])]
In [ ]:
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111)
plt.axis("off")
plt.suptitle('The top 11 influential individuals in the GCC')
nx.draw_networkx(
    main_gcc,
    pos=pos_gcc,
    ax=ax,
    node_color=bt_colors,
    node_size=bt_values,
    with_labels=False)
plt.savefig("IMG/infnet6yr_influencer.pdf", format='pdf', 
            bbox_inches="tight")
In [ ]:
top_ids = [a[0] for a in max_nodes]
pd_poinf.loc[top_ids]

Weighted 6yr Model

IN this section, we explore a version of the weight simple undirected grap of the 6yr model. The calculation of the weight of an edge is the sum of contributions by each author pair for all their publications (publication which involve both of them (and others).

Contributions are calculated as $$\text{contribution}(u,v) = \frac{1}{\text{size of complete graph}} = \frac{1}{{N}\choose{2}} = \frac{1}{\frac{N(N-1)}{2}} = \frac{2}{N(N-1)}$$ where $N$ is the number of authors for a publication.

Weight of edge between authors $u$ and $v$ $$weight(u,v) = \sum_{i=1}^{N}\text{contribution}_{i}(u,v)$$

In [160]:
def get_edge_weight(authors):
    # function to calculate the edge weight for each publication
    if len(authors) > 1:
        n = len(authors)
        k = 2. / (n * (n - 1))
    else:
        k = 0.
    return k
In [161]:
combined_yrs['weight'] = combined_yrs.apply(
    lambda row: get_edge_weight(row.collab_id), axis=1)
In [162]:
# Add the edges to the network with weight attribute
g_poinf_weighted = nx.Graph()
for row in combined_yrs.itertuples():
    edgelist = row.edges
    weight = row.weight
    for (a, b) in edgelist:
        if a in lookup_poinf.index and b in lookup_poinf.index:
            g_poinf_weighted.add_edge(a, b, weight=weight)
In [163]:
edgewidth = [
    d['weight'] * 2. for (u, v, d) in g_poinf_weighted.edges(data=True)
]  # weights for each edges
print(nx.info(g_poinf_weighted))
Name: 
Type: Graph
Number of nodes: 184
Number of edges: 361
Average degree:   3.9239
In [164]:
assert not set(g_infnet6yr.nodes).difference(
    set(g_poinf_weighted.nodes
        )), "Same number of nodes for both 6yr model should be observer"

Generating adjacency matrix

In [165]:
_NODES_ORDER = open(os.path.join(DATA_DIR,'poinf_collabgraph_1997-2017.txt'), 'r').readlines()
NODES_ORDER = [n.strip() for n in _NODES_ORDER]
In [166]:
# To ensure that the id corresponding to each individual is the same as for those,
# First, create a dump of the id that corresponds to the list of individual:
adj_mat, fig, order = create_adj_mat(
    g_poinf_weighted, NODES_ORDER, draw=True, use_order=False, weighted=True)
In [167]:
print(adj_mat.shape)
print(len(order))
with open(os.path.join(DATA_DIR, 'poinf_collabgraph_2012-2017_weighted.txt'), 'w') as f:
    f.write("\n".join(list(order)))
(184, 184)
184
In [168]:
# save the weighted matrix:
adj_mat.dump(os.path.join(DATA_DIR,'mat','infnet6yrs-weighted-adj-mat.order6yr.pkl'))
In [169]:
# Take a look at the distribution of edgewidth
fig = plt.hist(edgewidth, bins=100)
# plt.savefig('IMG/infnet6yrw_tofWeights.pdf', format='pdf', bbbox_inches="tight")

Graphing Weighted Network (6yr)

Circular layout for each class

In [173]:
draw_default_layout(
    g_poinf_weighted,
    lookup_poinf,
    'infnet6yrw',
    with_weight=True,
    scale=3,
    SAVE_GRAPHS=True)
2018-04-05 17:38:13,822 : INFO : SAVE_GRAPHS: 1

Circular Layout

In [174]:
edges_across = []
for (a, b) in g_poinf_weighted.edges:
    c_a = int(lookup_poinf.institute_class.loc[[str(a)]])
    c_b = int(lookup_poinf.institute_class.loc[[str(b)]])
    if c_a != c_b:
        if (b, a) not in edges_across:
            edges_across.append((a, b))
In [218]:
draw_circular_layout(
    g_poinf_weighted, lookup_poinf, file_prefix='infnet6yrw', with_weight=True, scale=5, SAVE_GRAPHS=True)
2018-04-10 01:53:03,038 : INFO : SAVE_GRAPHS: True
In [295]:
_join_names = lambda x_y: " ".join([x_y[0], x_y[1]])
lookup_poinf['full_name'] = list(
    map(_join_names, list(zip(lookup_poinf['first_name'], lookup_poinf['last_name']))))
In [309]:
pos = nx.spring_layout(g_poinf_weighted, k=.14)
In [310]:
g=g_poinf_weighted
scale=5
edgewidth = [d['weight'] * float(scale) for (u, v, d) in g.edges(data=True)]

# pos = nx.kamada_kawai_layout(g_poinf_weighted, pos=pos,scale=10)

f= plt.figure(figsize=(20,18))
ax=f.add_subplot(111)
label_dict = {}
for n in g_poinf_weighted.nodes:
    label_dict[n] = lookup_poinf.loc[n].full_name
nx.draw(
    g_poinf_weighted,
    pos=pos,
    ax=ax,
    width=edgewidth,
    node_size=80,
    edge_color='#999966',
    node_color=color_by_inst(g_poinf_weighted, lookup_poinf))
add_inst_labels(ax)
ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, 1.05),
    ncol=3,
    title='institutes',
    fontsize=12,
    fancybox=True,
    shadow=False)
nx.draw_networkx_labels(
    g_poinf_weighted, pos=pos, labels=label_dict, ax=ax,font_size=12.5, font_weight='bold');
In [311]:
f.tight_layout()
f.savefig('IMG/infnet6yrw_spring_wlabels.pdf',format='pdf',bbox_inches='tight')

Spring layout

In [289]:
# Draw the graph:
fig = plt.figure(figsize=(18, 18))
ax = fig.add_subplot(111)
# ax=add_inst_labels(ax)
ax.axis('off')
nx.draw_networkx_nodes(
    g_poinf_weighted,
    pos=pos,
    with_labels=False,
    ax=ax,
    node_size=90,
    node_color=color_by_inst(g_poinf_weighted, lookup_poinf))
nx.draw_networkx_edges(
    g_poinf_weighted,
    pos,
    width=edgewidth,
)
Out[289]:
<matplotlib.collections.LineCollection at 0x7f325071a080>
In [291]:
fig.tight_layout()
fig.savefig('IMG/infnet6yrw_springv2.pdf',format='pdf',bbox_inches='tight')
In [182]:
# Draw the graph:
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
# ax=add_inst_labels(ax)
ax.axis('off')
nx.draw_networkx_nodes(
    g_poinf_weighted,
    pos=pos_full,
    with_labels=False,
    ax=ax,
    node_size=20,
    node_color=color_by_inst(g_poinf_weighted, lookup_poinf))
nx.draw_networkx_edges(
    g_poinf_weighted,
    pos_full,
    width=edgewidth,
)
Out[182]:
<matplotlib.collections.LineCollection at 0x7f3251f56f98>
In [184]:
fig.savefig('IMG/infnet6yrw_withinfnet20yrpos.png',format='png',bbox_inches='tight')

Statistics

Degree distribution is the same as 3.2.1

Clustering

In [188]:
print('Average clustering coeff: ',
      nx.average_clustering(g_poinf_weighted, weight='weight'))
Average clustering coeff:  0.05543846114748951

This is lower than the unweighted graph ( 0.421964851597)!

Connected Components

In [189]:
gccs, percentage = generateGCC(g_poinf_weighted)
component 1: 94.57%
component 2: 1.09%
component 3: 1.09%
component 4: 1.09%
component 5: 1.09%
component 6: 1.09%
In [191]:
# display the connected components:
fig = plt.figure(figsize=(10, 10))
num_col = len(gccs) / 2
for i, g in enumerate(gccs, 1):
    percent = percentage[i - 1]
    ax = fig.add_subplot(num_col, 2, i)
    ax.axis('off')
    ax.set_title('Component {} ({:.2%})'.format(i, percent))
    nx.draw_networkx_nodes(
        g,
        node_color=color_by_inst(g, lookup_poinf),
        ax=ax,
        with_labels=False,
        node_size=20,
        pos=pos_6yr)
    g_edgewidth = [d['weight'] for (u, v, d) in g.edges(data=True)]
    nx.draw_networkx_edges(g, pos_6yr, width=np.multiply(g_edgewidth, 3))

plt.savefig("IMG/infnetweighted_CC.pdf", format='pdf', bbox_inches="tight")
In [192]:
main_gcc = gccs[0]
In [193]:
print('number of nodes in largest connected component:', len(main_gcc))
print(nx.info(main_gcc))
number of nodes in largest connected component: 174
Name: 
Type: Graph
Number of nodes: 174
Number of edges: 356
Average degree:   4.0920

Giant Connected Component

In [194]:
# Draw the graph:
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)
# ax = add_inst_labels(ax)
ax.axis('off')
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    with_labels=False,
    ax=ax,
    node_size=20,
    node_color=color_by_inst(main_gcc, lookup_poinf))
main_gcc_edgewidth = [d['weight'] for (u, v, d) in main_gcc.edges(data=True)]
nx.draw_networkx_edges(
    main_gcc, pos_gcc, width=np.multiply(main_gcc_edgewidth, 3))
ax.set_title('Largest connected component ({:.2%})'.format(percentage[0]))

if SAVE_GRAPHS:
    plt.savefig(
    "IMG/infnet6yrw_LargestCC.pdf", format='pdf', bbox_inches="tight")
In [195]:
print('Average clustering coeff: ',
      nx.average_clustering(main_gcc, weight='weight'))
Average clustering coeff:  0.05862457960424178
In [196]:
print('Average shortest path length: ',
      nx.average_shortest_path_length(main_gcc, weight='weight'))
Average shortest path length:  0.6684684785965641

Community Detection

In [ ]:
# Community detection using modularity
parts = community.best_partition(main_gcc)
values = [parts.get(node) for node in main_gcc.nodes()]
# assign each node to the community they belong to
In [ ]:
print('Number of communities detected: ', len(set(values)))
In [ ]:
# Plot the graphs side by side:
fig = plt.figure(figsize=(20, 10))
fig.suptitle(
    'Comparison between actual institutes and communities detected\n(Colors of nodes for graph on right is independent of those on left)'
)
ax1 = fig.add_subplot(121)
ax1.set_title('Actual communities in informatics collaboration network')
ax1.axis('off')
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    with_labels=False,
    ax=ax1,
    node_size=20,
    node_color=color_by_inst(main_gcc))
nx.draw_networkx_edges(
    main_gcc, pos_gcc, width=np.multiply(main_gcc_edgewidth, 3))

ax2 = fig.add_subplot(122)
ax2.set_title('Communities detected using modularity')
ax2.axis('off')
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    cmap=plt.get_cmap("rainbow"),
    ax=ax2,
    node_color=values,
    node_size=20,
    with_labels=False)
nx.draw_networkx_edges(
    main_gcc, pos_gcc, width=np.multiply(main_gcc_edgewidth, 3))
plt.savefig(
    "IMG/infnetweight_LargestCC_commCompare.pdf",
    format='pdf',
    bbox_inches="tight")

Influential Individuals

We use the top scores for the eigenvector centrality to determine who are the most influential individuals

In [ ]:
evect_centrality = nx.eigenvector_centrality_numpy(main_gcc, weight='weight')
In [ ]:
max_nodes = sorted(evect_centrality.items(), key=lambda v: -v[1])[:top]
max_nodes
In [ ]:
bt_values = [10] * len(main_gcc.nodes())
bt_colors = ['xkcd:black'] * len(main_gcc.nodes())
for max_key, max_val in max_nodes:
    bt_values[_nodes.index(max_key)] = (max_val * 100)**2
    bt_colors[_nodes.index(max_key)] = inst_by_color[int(
        pd_poinf.institute_class.loc[[str(max_key)]])]
In [ ]:
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
plt.axis("off")
plt.suptitle('The top 11 influential individuals in the GCC')
nx.draw_networkx_nodes(
    main_gcc,
    pos=pos_gcc,
    ax=ax,
    node_color=bt_colors,
    node_size=bt_values,
    with_labels=False)
nx.draw_networkx_edges(
    main_gcc, pos_gcc, width=np.multiply(main_gcc_edgewidth, 3))
plt.savefig(
    "IMG/infnetweight_influencer.pdf", format='pdf', bbox_inches="tight")
In [ ]:
top_ids = [a[0] for a in max_nodes]
pd_poinf.loc[top_ids]